import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
from matplotlib import pyplot as plt
from plotly.subplots import make_subplots
import plotly.graph_objects as go
# Load the diamonds dataset from CSV and preview the first rows.
# NOTE(review): this is an absolute, machine-specific path; the file must
# exist at exactly this location for the notebook to run.
csv_path = 'C:/Users/jsree/OneDrive/Documents/Python projects/Kaggle/Diamond Price/diamonds.csv'
data = pd.read_csv(csv_path)
data.head()
| | Unnamed: 0 | carat | cut | color | clarity | depth | table | price | x | y | z |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 0.23 | Ideal | E | SI2 | 61.5 | 55.0 | 326 | 3.95 | 3.98 | 2.43 |
| 1 | 2 | 0.21 | Premium | E | SI1 | 59.8 | 61.0 | 326 | 3.89 | 3.84 | 2.31 |
| 2 | 3 | 0.23 | Good | E | VS1 | 56.9 | 65.0 | 327 | 4.05 | 4.07 | 2.31 |
| 3 | 4 | 0.29 | Premium | I | VS2 | 62.4 | 58.0 | 334 | 4.20 | 4.23 | 2.63 |
| 4 | 5 | 0.31 | Good | J | SI2 | 63.3 | 58.0 | 335 | 4.34 | 4.35 | 2.75 |
# Report how many rows (diamonds) and columns the DataFrame holds.
shape_message = f'Data contain {data.shape[0]} diamonds and {data.shape[1]} columns.'
print(shape_message)
Data contain 53940 diamonds and 11 columns.
# Show the dtype pandas assigned to each column of the DataFrame.
data.dtypes
Unnamed: 0 int64 carat float64 cut object color object clarity object depth float64 table float64 price int64 x float64 y float64 z float64 dtype: object
# Summary statistics per column, transposed so each column becomes a row.
data.describe().transpose()
| | count | mean | std | min | 25% | 50% | 75% | max |
|---|---|---|---|---|---|---|---|---|
| Unnamed: 0 | 53940.0 | 26970.500000 | 15571.281097 | 1.0 | 13485.75 | 26970.50 | 40455.25 | 53940.00 |
| carat | 53940.0 | 0.797940 | 0.474011 | 0.2 | 0.40 | 0.70 | 1.04 | 5.01 |
| depth | 53940.0 | 61.749405 | 1.432621 | 43.0 | 61.00 | 61.80 | 62.50 | 79.00 |
| table | 53940.0 | 57.457184 | 2.234491 | 43.0 | 56.00 | 57.00 | 59.00 | 95.00 |
| price | 53940.0 | 3932.799722 | 3989.439738 | 326.0 | 950.00 | 2401.00 | 5324.25 | 18823.00 |
| x | 53940.0 | 5.731157 | 1.121761 | 0.0 | 4.71 | 5.70 | 6.54 | 10.74 |
| y | 53940.0 | 5.734526 | 1.142135 | 0.0 | 4.72 | 5.71 | 6.54 | 58.90 |
| z | 53940.0 | 3.538734 | 0.705699 | 0.0 | 2.91 | 3.53 | 4.04 | 31.80 |
# Count how many diamonds fall into each cut category.
cut_column = data['cut']
count = cut_column.value_counts()
count
Ideal 21551 Premium 13791 Very Good 12082 Good 4906 Fair 1610 Name: cut, dtype: int64
# Distribution plot of the cut column (number of diamonds per cut).
sns.displot(data=data, x="cut")
<seaborn.axisgrid.FacetGrid at 0x233cf204370>
# Bar chart of price for each cut category.
sns.barplot(data=data, x="cut", y="price")
<AxesSubplot:xlabel='cut', ylabel='price'>
# Summary statistics of the price column.
price_stats = data['price'].describe()
price_stats
count 53940.000000 mean 3932.799722 std 3989.439738 min 326.000000 25% 950.000000 50% 2401.000000 75% 5324.250000 max 18823.000000 Name: price, dtype: float64
# Single-panel figure that will hold a box plot of diamond prices.
fig_6 = make_subplots(rows=1, cols=1, specs=[[{'type': 'xy'}]])

# Box trace: prices along the x-axis, trace labelled 'p', coloured salmon.
price_box = go.Box(x=data['price'], name='p')
fig_6.add_trace(price_box)
fig_6.update_traces(marker_color='salmon')

# Layout: hide the legend, use the plain white template and small black Arial text.
layout_font = {'family': 'Arial', 'size': 8, 'color': 'black'}
fig_6.update_layout(
    showlegend=False,
    template='simple_white',
    font=layout_font,
)

# Render the figure.
fig_6.show()
# Count how many diamonds fall into each color grade.
color_column = data['color']
count = color_column.value_counts()
count
G 11292 E 9797 F 9542 H 8304 D 6775 I 5422 J 2808 Name: color, dtype: int64
Display color column

Color of Diamonds

- D, E, F – These are the whitest diamonds.
- G, H – These stones have a very slight hue of yellow (or gray or brown).
- I, J – These stones have a light to medium hue of yellow (or gray or brown).
- K, L – These stones have a strong hue of yellow (or gray or brown).
- M, N – These stones have a very strong hue of yellow (or gray or brown).
- N – Z – Any stone that is more yellow than N and less yellow than Z is called CAPE or DARK CAPE.
- Fancy Yellow – Any stone that is more yellow than Z is called Fancy Yellow, and its value is higher.
# Stacked distribution plot: diamonds per color, each bar split by cut.
sns.displot(data=data, x="color", hue="cut", multiple="stack")
<seaborn.axisgrid.FacetGrid at 0x233cf204c40>
Count the diamonds by color, with each bar stacked by type of cut.
# Line plot of price against carat.
sns.lineplot(
    x="carat",
    y="price",
    data=data,
)
<AxesSubplot:xlabel='carat', ylabel='price'>
There is a positive relationship between carat (the weight of the diamond) and price: heavier diamonds tend to cost more.